# Load the already-wrangled utterance export that every later step starts from.
df_raw <- read_csv(file = "data/CLEAN_UtterancesForAnalysis.csv")

# Build df_coded, the table of coded utterances used by all later sections.
# Rows are NOT unique utterances: there is one row per utterance + detail code.
df_coded <- df_raw %>%
  # ---- identifiers -------------------------------------------------------
  mutate(
    # one id per utterance + detail-code row
    sid = factor(SID),
    # explicit level order so happiness-first participants sort first
    pid = factor(
      PID,
      levels = c(
        # happiness-first
        "bjs827ee1u", "3r2sh20ei", "4728sjuiz", "7ACC0B75", "92ghd48xe",
        "iurmer289", "s294hoei",
        # space-first
        "j2719eertu2", "lkin27js09b", "li832lin23", "7382kwtue", "E1D39056",
        "8v892iige"
      )
    ),
    # utterance id: integer code of each distinct participant + text pairing
    uid = factor(as.numeric(factor(paste(pid, factor(Utterance))))),
    # participant labels, listed in the same order as the pid levels above
    PNUM = factor(
      PNUM,
      levels = c(
        "P6", "P9", "P10", "P2", "P4", "P12", "P13",
        "P5", "P7", "P8", "P3", "P1", "P11"
      )
    )
  ) %>%
  # ---- design factors ----------------------------------------------------
  mutate(
    # lower-case condition labels, ordered static before ixn
    TASK = factor(
      recode(Condition, "Static" = "static", "Interactive" = "ixn"),
      levels = c("static", "ixn")
    ),
    # Notebook becomes DATASET
    DATASET = factor(recode(Notebook, "Happiness" = "happiness", "Space" = "space")),
    # paste() uses its default " " separator, hence the " _ " inside the keys
    data_order = recode(
      factor(paste(TASK, "_", DATASET)),
      "ixn _ happiness" = "space-first",
      "ixn _ space" = "happiness-first",
      "static _ happiness" = "happiness-first",
      "static _ space" = "space-first"
    )
  ) %>%
  # ---- utterance text, representations, flags and codes ------------------
  mutate(
    utterance = Utterance,
    reps_group = factor(`group`),
    reps_all = factor(`All representations`),
    flag_story = `Flag Storytelling`,
    flag_correction = `Flag Correction`,
    flag_simultaneous = `Flag Simultaneous Characterization`,
    # top-level code: shorten "ANALYSIS PROCESS", then fix the display order
    code_topic = factor(
      recode(factor(Highlevel), "ANALYSIS PROCESS" = "PROCESS"),
      levels = c("PROCESS", "DATASET", "VARIABLE", "RELATIONSHIP")
    ),
    code_datatype = factor(`Data Type`),
    code_detail = factor(`Utterance Type`),
    timestamp = adj_timestamp,
    # was interaction used?
    ixn = factor(interaction_used)
  ) %>%
  select(
    sid, pid, PNUM, uid, TASK, DATASET, timestamp, ixn,
    code_topic, code_detail, code_datatype,
    flag_story, flag_correction, flag_simultaneous,
    utterance, reps_group, reps_all, data_order
  ) %>%
  arrange(data_order)

# Treat a missing flag as "not flagged": fill NA with FALSE in all three flag columns.
df_coded <- df_coded %>%
  mutate(
    across(
      c(flag_story, flag_correction, flag_simultaneous),
      ~ replace(.x, is.na(.x), FALSE)
    )
  )



# CALCULATE RELATIVE TASK TIMES
# For each participant x TASK, find the time of the first utterance and express
# every utterance as an offset from it:
#   task_start    - earliest utterance time in that participant's task
#   relative_time - offset from task_start (a time difference)
#   rel_time      - the same offset as a plain double, used on plot axes
# The offset is computed from the parsed `time` column rather than the raw
# `timestamp`, so parsing and subtraction use the same values. The unused
# task_end / task_mins / task_second intermediates were removed; they never
# reached the selected output.
df_time <- df_coded %>%
  mutate(time = hms::as_hms(timestamp)) %>%
  group_by(pid, TASK) %>%
  mutate(
    task_start = hms::as_hms(min(time)),
    relative_time = time - task_start,
    rel_time = as.double(relative_time)
  ) %>%
  ungroup() %>%
  dplyr::select(
    pid, PNUM, code_topic, code_detail, TASK, DATASET,
    timestamp, task_start, relative_time, rel_time
  )

There are 742 rows in the df_coded dataset, where each row represents an utterance coding (i.e. utterance + detail code). There are 662 unique utterances. The difference (742 − 662 = 80 rows) indicates utterances that were dual-coded (i.e. given two detail-level codes). No more than two codes were applied to a single utterance. For the purposes of analysis, dual-coded utterances will be treated as two utterances, as they have two distinct (but lexically inseparable) units of meaning.

1 DATA PROFILE

# Column-by-column profile of df_coded (type, top values, frequencies, missingness).
# NOTE(review): the rendered table still shows a "Missing" column, so
# `missing.col = FALSE` appears to have no effect — check whether the intended
# argument is `na.col`.
# NOTE(review): `method = "render"` looks like a print/view option rather than a
# dfSummary() argument — confirm it is honoured when passed here.
df_coded%>% summarytools::dfSummary(
             plain.ascii  = FALSE,
             graph.magnif = 0.75,
             style        = "grid",
             tmp.img.dir  = "temp",
             missing.col = FALSE, 
             method = "render"
)

1.0.1 Data Frame Summary

1.0.1.1 df_coded

Dimensions: 742 x 18
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sid
[factor]
1. 0
2. 1
3. 2
4. 3
5. 4
6. 5
7. 6
8. 7
9. 8
10. 9
[ 732 others ]
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
732 (98.7%)
742
(100.0%)
0
(0.0%)
2 pid
[factor]
1. bjs827ee1u
2. 3r2sh20ei
3. 4728sjuiz
4. 7ACC0B75
5. 92ghd48xe
6. iurmer289
7. s294hoei
8. j2719eertu2
9. lkin27js09b
10. li832lin23
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
3 PNUM
[factor]
1. P6
2. P9
3. P10
4. P2
5. P4
6. P12
7. P13
8. P5
9. P7
10. P8
[ 3 others ]
29 ( 3.9%)
103 (13.9%)
43 ( 5.8%)
28 ( 3.8%)
56 ( 7.5%)
87 (11.7%)
88 (11.9%)
82 (11.1%)
48 ( 6.5%)
51 ( 6.9%)
127 (17.1%)
742
(100.0%)
0
(0.0%)
4 uid
[factor]
1. 1
2. 2
3. 3
4. 4
5. 5
6. 6
7. 7
8. 8
9. 9
10. 10
[ 652 others ]
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
730 (98.4%)
742
(100.0%)
0
(0.0%)
5 TASK
[factor]
1. static
2. ixn
396 (53.4%)
346 (46.6%)
742
(100.0%)
0
(0.0%)
6 DATASET
[factor]
1. happiness
2. space
424 (57.1%)
318 (42.9%)
742
(100.0%)
0
(0.0%)
7 timestamp
[hms, difftime]
min : 622
med : 2857
max : 6900
units : secs
622 distinct values 742
(100.0%)
0
(0.0%)
8 ixn
[factor]
1. FALSE
2. TRUE
633 (85.3%)
109 (14.7%)
742
(100.0%)
0
(0.0%)
9 code_topic
[factor]
1. PROCESS
2. DATASET
3. VARIABLE
4. RELATIONSHIP
160 (21.6%)
176 (23.7%)
122 (16.4%)
284 (38.3%)
742
(100.0%)
0
(0.0%)
10 code_detail
[factor]
1. data orientation
2. data provenance
3. data size
4. distribution outlier (var
5. distribution range [min,
6. distribution shape [shape
7. distribution variance (sd
8. missing data
9. outlier (relationship)
10. plan of action
[ 8 others ]
16 ( 2.2%)
11 ( 1.5%)
9 ( 1.2%)
9 ( 1.2%)
33 ( 4.4%)
79 (10.6%)
1 ( 0.1%)
76 (10.2%)
20 ( 2.7%)
52 ( 7.0%)
436 (58.8%)
742
(100.0%)
0
(0.0%)
11 code_datatype
[factor]
1. distribution (continuous
2. distribution (categorical
3. relationship (categorical
4. relationship (categorical
5. relationship (continuous
6. relationship (multivariat
76 (17.8%)
54 (12.7%)
28 ( 6.6%)
55 (12.9%)
146 (34.3%)
67 (15.7%)
426
(57.4%)
316
(42.6%)
12 flag_story
[logical]
1. FALSE
2. TRUE
700 (94.3%)
42 ( 5.7%)
742
(100.0%)
0
(0.0%)
13 flag_correction
[logical]
1. FALSE
2. TRUE
733 (98.8%)
9 ( 1.2%)
742
(100.0%)
0
(0.0%)
14 flag_simultaneous
[logical]
1. FALSE
2. TRUE
682 (91.9%)
60 ( 8.1%)
742
(100.0%)
0
(0.0%)
15 utterance
[character]
1. [Talking about the profil
2. actually, let me see if p
3. Although we have like les
4. And are they within range
5. And confidence in governm
6. And just I want to see ho
7. NA
8. NA
9. Because it does seem like
10. Data frame. Got a bunch o
[ 652 others ]
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
722 (97.3%)
742
(100.0%)
0
(0.0%)
16 reps_group
[factor]
1. barplot
2. columns
3. columns_data_dictionary
4. data_dictionary
5. data_dictionary_dataframe
6. data_dictionary_describe
7. dataframe
8. dataframe_describe
9. dataframe_heatmap
10. dataframe_pairplot
[ 15 others ]
16 ( 2.2%)
4 ( 0.5%)
1 ( 0.1%)
59 ( 8.0%)
1 ( 0.1%)
10 ( 1.3%)
80 (10.8%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
568 (76.5%)
742
(100.0%)
0
(0.0%)
17 reps_all
[factor]
1. affect_corruption_brush_7
2. Age_CryoSleep_scatterplot
3. age_CryoSleep_ShoppingMal
4. Age_RoomService_scatterpl
5. age_roomservice_scatterpl
6. Age_RoomService_scatterpl
7. Age_ShoppingMall_scatterp
8. altair_profile_contVars_j
9. alx_barplot_df_homeplanet
10. alx_barplot_df_homeplanet
[ 245 others ]
4 ( 0.6%)
3 ( 0.4%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
693 (97.7%)
709
(95.6%)
33
(4.4%)
18 data_order
[factor]
1. space-first
2. happiness-first
308 (41.5%)
434 (58.5%)
742
(100.0%)
0
(0.0%)

2 UTTERANCES

2.1 by TASK

print("BY TASK")

[1] “BY TASK”

# Frequency table of coded-utterance rows per TASK (static vs ixn),
# without cumulative columns, headings or an NA row.
freq(df_coded$TASK, 
     cumul      = FALSE,
     headings   = FALSE,
     report.nas = FALSE,
     plain.ascii = FALSE) 
  Freq %
static 396 53.37
ixn 346 46.63
Total 742 100.00

2.2 by TASK and DATASET

# Cross-tabulate TASK (rows) by DATASET (columns).
# prop = "t" reports each cell as a share of all rows in df_coded.
ctable(x = df_coded$TASK, 
       y = df_coded$DATASET, 
       prop = "t")  

Cross-Tabulation, Total Proportions
TASK * DATASET
Data Frame: df_coded

DATASET happiness space Total
TASK
static 256 (34.5%) 140 (18.9%) 396 ( 53.4%)
ixn 168 (22.6%) 178 (24.0%) 346 ( 46.6%)
Total 424 (57.1%) 318 (42.9%) 742 (100.0%)
# Count coded-utterance rows for every TASK x DATASET cell (column `c`).
# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message.
df_summary <- df_coded %>%
  group_by(TASK, DATASET) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )

# Stacked bars: one bar per TASK, one segment per DATASET, with each segment's
# count printed just below its top edge.
ggplot(data = df_summary, mapping = aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  # scale_fill_brewer(type="qual", palette = 4) +
  labs(
    title = "Utterances by TASK and DATASET",
    subtitle = "More utterances in STATIC; more utterances in HAPPINESS",
    x = "TASK",
    y = "count"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

2.3 by PARTICIPANT

# Cross-tabulate participant (rows) by TASK (columns).
# prop = "r" reports row proportions: each participant's split between tasks.
ctable(x = df_coded$PNUM, 
       y = df_coded$TASK, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * TASK
Data Frame: df_coded

TASK static ixn Total
PNUM
P6 11 (37.9%) 18 (62.1%) 29 (100.0%)
P9 63 (61.2%) 40 (38.8%) 103 (100.0%)
P10 30 (69.8%) 13 (30.2%) 43 (100.0%)
P2 18 (64.3%) 10 (35.7%) 28 (100.0%)
P4 28 (50.0%) 28 (50.0%) 56 (100.0%)
P12 46 (52.9%) 41 (47.1%) 87 (100.0%)
P13 60 (68.2%) 28 (31.8%) 88 (100.0%)
P5 33 (40.2%) 49 (59.8%) 82 (100.0%)
P7 29 (60.4%) 19 (39.6%) 48 (100.0%)
P8 17 (33.3%) 34 (66.7%) 51 (100.0%)
P3 24 (44.4%) 30 (55.6%) 54 (100.0%)
P1 10 (40.0%) 15 (60.0%) 25 (100.0%)
P11 27 (56.2%) 21 (43.8%) 48 (100.0%)
Total 396 (53.4%) 346 (46.6%) 742 (100.0%)
# Utterance counts per participant: horizontal bars filled by DATASET,
# one facet column per TASK.
gf_facet_grid(
  gf_bar(PNUM ~ ., fill = ~DATASET, data = df_coded),
  . ~ TASK
) +
  labs(
    title = "Utterances by Participant, Dataset and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "DATASET"
  )

2.4 by TIME

# DOTPLOT: one point per coded utterance, placed at its offset from the start
# of the participant's task; one facet row per TASK.
# Faceting uses vars(TASK) so the variable is looked up in the plot data
# instead of an external vector that must happen to line up row-for-row.
# The colour scale and colour label were dropped: no colour aesthetic is mapped.
ggplot(df_time, aes(x = rel_time, y = PNUM)) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  theme_minimal() +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant"
  )

# HISTOGRAMS BY TASK: distribution of utterance offsets in 30-second bins, on
# the density scale so the kernel density curve can be overlaid.
# - after_stat(density) replaces the deprecated ..density.. notation
# - the y axis is labelled as density, which is what is drawn
# - legend.position takes "none" ("blank" is not a valid position)
ggplot(df_time, aes(x = rel_time)) +
  geom_histogram(binwidth = 30, aes(y = after_stat(density))) +
  geom_density() +
  facet_grid(rows = vars(TASK)) +
  labs(
    title = "Participant Utterances over timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "density of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

3 TOP-CODE

3.1 by TASK

# Cross-tabulate top-level code (rows) by TASK (columns).
# prop = "r" reports row proportions: how each topic splits across tasks.
ctable(x = df_coded$code_topic, 
       y = df_coded$TASK, 
       prop = "r")  

Cross-Tabulation, Row Proportions
code_topic * TASK
Data Frame: df_coded

TASK static ixn Total
code_topic
PROCESS 92 (57.5%) 68 (42.5%) 160 (100.0%)
DATASET 99 (56.2%) 77 (43.8%) 176 (100.0%)
VARIABLE 76 (62.3%) 46 (37.7%) 122 (100.0%)
RELATIONSHIP 129 (45.4%) 155 (54.6%) 284 (100.0%)
Total 396 (53.4%) 346 (46.6%) 742 (100.0%)
# Count coded-utterance rows for every code_topic x TASK cell (column `c`).
# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message.
df_summary <- df_coded %>%
  group_by(code_topic, TASK) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )

# Stacked bars per TASK, segmented by top-level code. fct_rev() reverses the
# factor order used for fill/stacking; counts are printed inside the segments.
ggplot(
  data = df_summary,
  mapping = aes(x = TASK, y = c, fill = fct_rev(code_topic))
) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK",
    subtitle = "",
    x = "TASK",
    y = "count",
    fill = "TOPIC"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

3.2 by TASK and DATASET

# Count coded-utterance rows for every code_topic x TASK x DATASET cell (column `c`).
# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message.
df_summary <- df_coded %>%
  group_by(code_topic, TASK, DATASET) %>%
  dplyr::summarise(
    c = n(),
    .groups = "drop"
  )

# STACKED BAR BY TASK, FACET DATASET: topic counts per TASK, one panel per DATASET.
# Faceting uses vars(DATASET) so the variable is resolved inside the plot data
# rather than from an external vector that must match it row-for-row.
ggplot(df_summary, aes(x = TASK, y = c, fill = fct_rev(code_topic))) +
  facet_wrap(vars(DATASET)) +
  geom_col() +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "TOPICS by TASK and DATASET",
    subtitle = "",
    x = "TASK", y = "count", fill = "TOPIC"
  ) +
  theme_minimal()

# + theme(legend.position = "blank")

3.3 by PARTICIPANT

# Cross-tabulate participant (rows) by top-level code (columns).
# prop = "r" reports row proportions: each participant's topic mix.
ctable(x = df_coded$PNUM, 
       y = df_coded$code_topic, 
       prop = "r")  

Cross-Tabulation, Row Proportions
PNUM * code_topic
Data Frame: df_coded

code_topic PROCESS DATASET VARIABLE RELATIONSHIP Total
PNUM
P6 6 (20.7%) 5 (17.2%) 5 (17.2%) 13 (44.8%) 29 (100.0%)
P9 19 (18.4%) 36 (35.0%) 23 (22.3%) 25 (24.3%) 103 (100.0%)
P10 8 (18.6%) 11 (25.6%) 8 (18.6%) 16 (37.2%) 43 (100.0%)
P2 3 (10.7%) 8 (28.6%) 16 (57.1%) 1 ( 3.6%) 28 (100.0%)
P4 10 (17.9%) 6 (10.7%) 11 (19.6%) 29 (51.8%) 56 (100.0%)
P12 21 (24.1%) 28 (32.2%) 17 (19.5%) 21 (24.1%) 87 (100.0%)
P13 41 (46.6%) 7 ( 8.0%) 8 ( 9.1%) 32 (36.4%) 88 (100.0%)
P5 6 ( 7.3%) 18 (22.0%) 18 (22.0%) 40 (48.8%) 82 (100.0%)
P7 14 (29.2%) 14 (29.2%) 7 (14.6%) 13 (27.1%) 48 (100.0%)
P8 10 (19.6%) 7 (13.7%) 3 ( 5.9%) 31 (60.8%) 51 (100.0%)
P3 7 (13.0%) 6 (11.1%) 2 ( 3.7%) 39 (72.2%) 54 (100.0%)
P1 8 (32.0%) 4 (16.0%) 0 ( 0.0%) 13 (52.0%) 25 (100.0%)
P11 7 (14.6%) 26 (54.2%) 4 ( 8.3%) 11 (22.9%) 48 (100.0%)
Total 160 (21.6%) 176 (23.7%) 122 (16.4%) 284 (38.3%) 742 (100.0%)
# TOPICS by PARTICIPANT, facet TASK: horizontal bars per participant, filled by
# top-level code, one facet column per TASK.
# The title was copied from the DATASET-filled version of this chart; it now
# names what the fill actually encodes (topic).
gf_bar(PNUM ~ ., fill = ~ fct_rev(code_topic), data = df_coded) %>%
  gf_facet_grid(. ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  labs(
    title = "Topics by Participant and Task",
    subtitle = "",
    x = "number of coded utterances",
    y = "participant",
    fill = "TOPIC"
  ) +
  theme_minimal()

# #TOPICS by PARTICPANT facet TASK
# gf_bar( PNUM ~., fill = ~ fct_rev(code_topic), data = df_coded) %>% 
#   gf_facet_grid(DATASET~TASK) + 
#   scale_fill_brewer(type="qual", palette = 3) +
#   labs(
#     title = "Utterances by Participant, Dataset and Task",
#     subtitle = "",
#     x = "number of coded utterances",
#     y = "participant",
#     fill = "DATASET"
#   )

3.4 by TIME

# DOTPLOT: one point per coded utterance at its offset from task start,
# coloured by top-level code; one facet row per TASK.
# Faceting uses vars(TASK) so the variable comes from the plot data itself.
ggplot(df_time, aes(x = rel_time, y = PNUM, color = fct_rev(code_topic))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "qual", palette = 3) +
  theme_minimal() +
  labs(
    title = "Topic of Utterances over timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant",
    color = "Topic"
  )

# HISTOGRAMS BY TOPIC AND TASK: utterance offsets in 30-second bins on the
# density scale with a density curve; rows = top-level code, columns = TASK.
# - after_stat(density) replaces the deprecated ..density.. notation
# - the facet formula uses bare column names, resolved in the plot data
# - the legend is hidden with "none" ("blank" is not a valid position); the
#   facet strips already name each topic
ggplot(df_time, aes(x = rel_time)) +
  geom_histogram(
    binwidth = 30,
    aes(
      y = after_stat(density),
      fill = fct_rev(code_topic),
      color = fct_rev(code_topic)
    )
  ) +
  geom_density() +
  facet_grid(code_topic ~ TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  scale_color_brewer(type = "qual", palette = 3) +
  labs(
    title = "Topic of Utterance over timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "density of utterances",
    fill = "Topic"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

4 DETAIL—PROCESS

# PREP DATA FRAMES for the PROCESS topic
# df_process      - PROCESS rows of df_coded (ids, design factors, detail code)
# df_time_process - PROCESS rows of df_time, including rel_time
# df_summary      - row count `c` per detail code x TASK
df_process <- df_coded %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_process <- df_time %>%
  filter(code_topic == "PROCESS") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, rel_time, code_detail)

# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message
df_summary <- df_process %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")


# PROCESS detail codes per TASK: stacked bars with the count printed in each segment.
ggplot(data = df_summary, mapping = aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "PROCESS Utterances by TASK",
    subtitle = "",
    x = "TASK",
    y = "count"
  ) +
  theme_minimal()

# DETAILS DOTPLOT: PROCESS utterances at their offset from task start, coloured
# by detail code; one facet row per TASK.
# Faceting uses vars(TASK) (resolved in the plot data), and the legend title
# now says what the colour encodes: the detail code, not the topic.
ggplot(df_time_process, aes(x = rel_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant",
    color = "Detail code"
  )

# DETAIL HISTOGRAMS BY TASK: counts of PROCESS utterances in 30-second bins;
# rows = TASK, columns = detail code.
# The facet formula uses bare column names (resolved in the plot data) and the
# legend is hidden with "none" ("blank" is not a valid position); the facet
# strips already label each detail code.
ggplot(df_time_process, aes(x = rel_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "PROCESS Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

5 DETAIL—DATASET

# PREP DATA FRAMES for the DATASET topic
# df_dataset      - DATASET rows of df_coded (ids, design factors, detail code)
# df_time_dataset - DATASET rows of df_time, including rel_time
# df_summary      - row count `c` per detail code x TASK
df_dataset <- df_coded %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_dataset <- df_time %>%
  filter(code_topic == "DATASET") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, rel_time, code_detail)

# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message
df_summary <- df_dataset %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")


# DATASET detail codes per TASK: stacked bars with the count printed in each segment.
ggplot(data = df_summary, mapping = aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by TASK",
    subtitle = "",
    x = "TASK",
    y = "count"
  ) +
  theme_minimal()

# DETAILS DOTPLOT: DATASET utterances at their offset from task start, coloured
# by detail code; one facet row per TASK.
# The title previously read "VARIABLE" (copy-paste slip); this chart shows the
# DATASET topic. Faceting uses vars(TASK), and the legend title names the
# detail code that the colour encodes.
ggplot(df_time_dataset, aes(x = rel_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = 4) +
  theme_minimal() +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant",
    color = "Detail code"
  )

# DETAIL HISTOGRAMS BY TASK: counts of DATASET utterances in 30-second bins;
# rows = TASK, columns = detail code.
# The title previously read "VARIABLE" (copy-paste slip). The facet formula
# uses bare column names and the legend is hidden with "none" ("blank" is not a
# valid position).
ggplot(df_time_dataset, aes(x = rel_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 4) +
  labs(
    title = "DATASET Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

6 DETAIL—VARIABLE

# PREP DATA FRAMES for the VARIABLE topic
# df_variable      - VARIABLE rows of df_coded (ids, design factors, detail code)
# df_time_variable - VARIABLE rows of df_time, including rel_time
# df_summary       - row count `c` per detail code x TASK
df_variable <- df_coded %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_variable <- df_time %>%
  filter(code_topic == "VARIABLE") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, rel_time, code_detail)

# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message
df_summary <- df_variable %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")


# VARIABLE detail codes per TASK: stacked bars with the count printed in each segment.
ggplot(data = df_summary, mapping = aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by TASK",
    subtitle = "",
    x = "TASK",
    y = "count"
  ) +
  theme_minimal()

# DETAILS DOTPLOT: VARIABLE utterances at their offset from task start, coloured
# by detail code; one facet row per TASK.
# Faceting uses vars(TASK) (resolved in the plot data), and the legend title
# names the detail code that the colour encodes.
ggplot(df_time_variable, aes(x = rel_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = 5) +
  theme_minimal() +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant",
    color = "Detail code"
  )

# DETAIL HISTOGRAMS BY TASK: counts of VARIABLE utterances in 30-second bins;
# rows = TASK, columns = detail code.
# The facet formula uses bare column names (resolved in the plot data) and the
# legend is hidden with "none" ("blank" is not a valid position).
ggplot(df_time_variable, aes(x = rel_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 5) +
  labs(
    title = "VARIABLE Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

7 DETAIL—RELATIONSHIP

# PREP DATA FRAMES for the RELATIONSHIP topic
# df_relationship      - RELATIONSHIP rows of df_coded (ids, design factors, detail code)
# df_time_relationship - RELATIONSHIP rows of df_time, including rel_time
# df_summary           - row count `c` per detail code x TASK
df_relationship <- df_coded %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, code_detail)

df_time_relationship <- df_time %>%
  filter(code_topic == "RELATIONSHIP") %>%
  dplyr::select(pid, PNUM, TASK, DATASET, rel_time, code_detail)

# .groups = "drop" returns an ungrouped tibble and avoids the regrouping message
df_summary <- df_relationship %>%
  group_by(code_detail, TASK) %>%
  dplyr::summarise(c = n(), .groups = "drop")


# RELATIONSHIP detail codes per TASK: stacked bars with the count printed in each segment.
ggplot(data = df_summary, mapping = aes(x = TASK, y = c, fill = code_detail)) +
  geom_col() +
  geom_text(
    mapping = aes(label = c),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 1.5
  ) +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(
    title = "RELATIONSHIP Utterances by TASK",
    subtitle = "",
    x = "TASK",
    y = "count"
  ) +
  theme_minimal()

# DETAILS DOTPLOT: RELATIONSHIP utterances at their offset from task start,
# coloured by detail code; one facet row per TASK.
# Faceting uses vars(TASK) (resolved in the plot data), and the legend title
# names the detail code that the colour encodes.
ggplot(df_time_relationship, aes(x = rel_time, y = PNUM, color = fct_rev(code_detail))) +
  geom_point(alpha = 0.5, size = 3) +
  facet_grid(rows = vars(TASK)) +
  scale_color_brewer(type = "seq", palette = 3) +
  theme_minimal() +
  labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "Participant",
    color = "Detail code"
  )

# DETAIL HISTOGRAMS BY TASK: counts of RELATIONSHIP utterances in 30-second
# bins; rows = TASK, columns = detail code.
# The facet formula uses bare column names (resolved in the plot data) and the
# legend is hidden with "none" ("blank" is not a valid position).
ggplot(df_time_relationship, aes(x = rel_time, fill = fct_rev(code_detail))) +
  geom_histogram(binwidth = 30) +
  facet_grid(TASK ~ code_detail) +
  scale_fill_brewer(type = "seq", palette = 3) +
  labs(
    title = "RELATIONSHIP Utterances by timecourse of Task",
    x = "timecourse of task (seconds)",
    y = "frequency of utterances"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

8 REPRESENTATIONS

8.1 Representation Groups

How many representations were created?

print("BY TASK")

[1] “BY TASK”

# Cross-tabulate representation group (rows) by TASK (columns).
# prop = "t" reports each cell as a share of all rows in df_coded.
ctable(df_coded$reps_group, df_coded$TASK,
    prop = "t",
    plain.ascii = FALSE)

8.1.1 Cross-Tabulation, Total Proportions

8.1.1.1 reps_group * TASK

Data Frame: df_coded

TASK static ixn Total
reps_group
barplot 8 ( 1.1%) 8 ( 1.1%) 16 ( 2.2%)
columns 4 ( 0.5%) 0 ( 0.0%) 4 ( 0.5%)
columns_data_dictionary 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary 29 ( 3.9%) 30 ( 4.0%) 59 ( 8.0%)
data_dictionary_dataframe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 9 ( 1.2%) 1 ( 0.1%) 10 ( 1.3%)
dataframe 55 ( 7.4%) 25 ( 3.4%) 80 ( 10.8%)
dataframe_describe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_heatmap 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_pairplot 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 1 ( 0.1%) 2 ( 0.3%) 3 ( 0.4%)
describe 11 ( 1.5%) 13 ( 1.8%) 24 ( 3.2%)
describe_profile 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
double-profiler 13 ( 1.8%) 8 ( 1.1%) 21 ( 2.8%)
heatmap 13 ( 1.8%) 7 ( 0.9%) 20 ( 2.7%)
hist 6 ( 0.8%) 0 ( 0.0%) 6 ( 0.8%)
info 9 ( 1.2%) 6 ( 0.8%) 15 ( 2.0%)
lineplot 38 ( 5.1%) 0 ( 0.0%) 38 ( 5.1%)
Multi-view Chart 9 ( 1.2%) 50 ( 6.7%) 59 ( 8.0%)
none 17 ( 2.3%) 14 ( 1.9%) 31 ( 4.2%)
pairplot 38 ( 5.1%) 14 ( 1.9%) 52 ( 7.0%)
profile 47 ( 6.3%) 59 ( 8.0%) 106 ( 14.3%)
python 30 ( 4.0%) 30 ( 4.0%) 60 ( 8.1%)
scatterplot 48 ( 6.5%) 78 (10.5%) 126 ( 17.0%)
stripplot 6 ( 0.8%) 0 ( 0.0%) 6 ( 0.8%)
Total 396 (53.4%) 346 (46.6%) 742 (100.0%)
print("BY DATASET")

[1] “BY DATASET”

# Cross-tabulate representation group (rows) by DATASET (columns).
# prop = "t" reports each cell as a share of all rows in df_coded.
ctable(df_coded$reps_group, df_coded$DATASET,
    prop = "t",
    plain.ascii = FALSE)

8.1.2 Cross-Tabulation, Total Proportions

8.1.2.1 reps_group * DATASET

Data Frame: df_coded

DATASET happiness space Total
reps_group
barplot 3 ( 0.4%) 13 ( 1.8%) 16 ( 2.2%)
columns 1 ( 0.1%) 3 ( 0.4%) 4 ( 0.5%)
columns_data_dictionary 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
data_dictionary 24 ( 3.2%) 35 ( 4.7%) 59 ( 8.0%)
data_dictionary_dataframe 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 7 ( 0.9%) 3 ( 0.4%) 10 ( 1.3%)
dataframe 37 ( 5.0%) 43 ( 5.8%) 80 ( 10.8%)
dataframe_describe 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_heatmap 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_pairplot 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 0 ( 0.0%) 3 ( 0.4%) 3 ( 0.4%)
describe 16 ( 2.2%) 8 ( 1.1%) 24 ( 3.2%)
describe_profile 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
double-profiler 3 ( 0.4%) 18 ( 2.4%) 21 ( 2.8%)
heatmap 15 ( 2.0%) 5 ( 0.7%) 20 ( 2.7%)
hist 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
info 9 ( 1.2%) 6 ( 0.8%) 15 ( 2.0%)
lineplot 38 ( 5.1%) 0 ( 0.0%) 38 ( 5.1%)
Multi-view Chart 38 ( 5.1%) 21 ( 2.8%) 59 ( 8.0%)
none 20 ( 2.7%) 11 ( 1.5%) 31 ( 4.2%)
pairplot 44 ( 5.9%) 8 ( 1.1%) 52 ( 7.0%)
profile 55 ( 7.4%) 51 ( 6.9%) 106 ( 14.3%)
python 9 ( 1.2%) 51 ( 6.9%) 60 ( 8.1%)
scatterplot 102 (13.7%) 24 ( 3.2%) 126 ( 17.0%)
stripplot 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
Total 424 (57.1%) 318 (42.9%) 742 (100.0%)

8.2 TOPICS + REPRESENTATIONS

# Work on a copy named `df`; the rendered table reports "Data Frame: df".
df <- df_coded 

# Cross-tabulate representation group (rows) by top-level code (columns).
# prop = "t" reports each cell as a share of all rows.
ctable(df$reps_group, df$code_topic,
    prop = "t",
    plain.ascii = FALSE)

8.2.1 Cross-Tabulation, Total Proportions

8.2.1.1 reps_group * code_topic

Data Frame: df

code_topic PROCESS DATASET VARIABLE RELATIONSHIP Total
reps_group
barplot 5 ( 0.7%) 1 ( 0.1%) 2 ( 0.3%) 8 ( 1.1%) 16 ( 2.2%)
columns 0 ( 0.0%) 2 ( 0.3%) 1 ( 0.1%) 1 ( 0.1%) 4 ( 0.5%)
columns_data_dictionary 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary 7 ( 0.9%) 41 ( 5.5%) 2 ( 0.3%) 9 ( 1.2%) 59 ( 8.0%)
data_dictionary_dataframe 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
data_dictionary_describe 0 ( 0.0%) 1 ( 0.1%) 7 ( 0.9%) 2 ( 0.3%) 10 ( 1.3%)
dataframe 21 ( 2.8%) 34 ( 4.6%) 7 ( 0.9%) 18 ( 2.4%) 80 ( 10.8%)
dataframe_describe 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_heatmap 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 1 ( 0.1%)
dataframe_pairplot 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
dataframe_profile 1 ( 0.1%) 2 ( 0.3%) 0 ( 0.0%) 0 ( 0.0%) 3 ( 0.4%)
describe 2 ( 0.3%) 14 ( 1.9%) 3 ( 0.4%) 5 ( 0.7%) 24 ( 3.2%)
describe_profile 0 ( 0.0%) 0 ( 0.0%) 1 ( 0.1%) 0 ( 0.0%) 1 ( 0.1%)
double-profiler 2 ( 0.3%) 4 ( 0.5%) 3 ( 0.4%) 12 ( 1.6%) 21 ( 2.8%)
heatmap 5 ( 0.7%) 0 ( 0.0%) 0 ( 0.0%) 15 ( 2.0%) 20 ( 2.7%)
hist 0 ( 0.0%) 0 ( 0.0%) 3 ( 0.4%) 3 ( 0.4%) 6 ( 0.8%)
info 5 ( 0.7%) 8 ( 1.1%) 1 ( 0.1%) 1 ( 0.1%) 15 ( 2.0%)
lineplot 20 ( 2.7%) 0 ( 0.0%) 0 ( 0.0%) 18 ( 2.4%) 38 ( 5.1%)
Multi-view Chart 11 ( 1.5%) 10 ( 1.3%) 13 ( 1.8%) 25 ( 3.4%) 59 ( 8.0%)
none 16 ( 2.2%) 5 ( 0.7%) 2 ( 0.3%) 8 ( 1.1%) 31 ( 4.2%)
pairplot 10 ( 1.3%) 1 ( 0.1%) 10 ( 1.3%) 31 ( 4.2%) 52 ( 7.0%)
profile 21 ( 2.8%) 20 ( 2.7%) 50 ( 6.7%) 15 ( 2.0%) 106 ( 14.3%)
python 7 ( 0.9%) 22 ( 3.0%) 11 ( 1.5%) 20 ( 2.7%) 60 ( 8.1%)
scatterplot 27 ( 3.6%) 8 ( 1.1%) 5 ( 0.7%) 86 (11.6%) 126 ( 17.0%)
stripplot 0 ( 0.0%) 0 ( 0.0%) 0 ( 0.0%) 6 ( 0.8%) 6 ( 0.8%)
Total 160 (21.6%) 176 (23.7%) 122 (16.4%) 284 (38.3%) 742 (100.0%)

8.3 TODO PICK UP HERE

9 MODELLING

# DEFINE DATAFRAME: ids plus the two design factors, one row per utterance-code.
df <- df_coded %>% dplyr::select(pid, uid, TASK, DATASET)

# MOSAIC PLOT of DATASET x TASK; tile areas show cell proportions and
# labeling_values prints the observed counts.
# - The subtitle count is taken from the data. The old hard-coded "734" no
#   longer matched the 742 rows of df_coded.
# - set_varnames is keyed by the variables in the formula. The old keys
#   (`graph`, `datset`) matched neither variable.
mosaic(
  formula = ~ DATASET + TASK,
  data = df,
  main = "Proportion of Utterances by TASK and DATASET",
  sub = paste0("u = ", nrow(df), " utterance-codes"),
  labeling = labeling_values,
  labeling_args = list(
    set_varnames = c(TASK = "TASK", DATASET = "DATASET")
  )
)

9.1 UTTERANCES

How much variance in the number of utterances is explained by DATASET, TASK and PARTICIPANT?

9.1.1 Linear Mixed Effects Models

# DEFINE DATAFRAME: one row per participant x DATASET x TASK cell with the
# number of coded utterances in it (n_utterances), the outcome for the models.
# .groups = "drop" returns a plain ungrouped tibble. The previous "keep" left
# every row in its own one-row group, which the model and reporting functions
# do not need and group-aware helpers could act on.
df <- df_coded %>%
  group_by(pid, DATASET, TASK) %>%
  dplyr::summarise(
    n_utterances = n(),
    .groups = "drop"
  )

# Linear mixed model: number of utterances predicted by the additive effects of
# DATASET and TASK, with a random intercept per participant (pid).
# NOTE(review): n_utterances is a count fitted with a Gaussian model; a Poisson
# alternative is sketched but commented out in section 9.1.2 — confirm the
# choice using the check_model() diagnostics.
print("LMER, UTTERANCES ~ DATASET + TASK")
## [1] "LMER, UTTERANCES ~ DATASET + TASK"
mm1 <- lmer(n_utterances ~ DATASET + TASK+ (1|pid), data = df)
paste("Model")
## [1] "Model"
summ(mm1)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 199.41
BIC 205.70
Pseudo-R² (fixed effects) 0.09
Pseudo-R² (total) 0.64
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 34.11 4.23 8.07 20.73 0.00
DATASETspace -7.90 3.48 -2.27 11.00 0.04
TASKixn -3.24 3.48 -0.93 11.00 0.37
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 10.97
Residual 8.85
Grouping Variables
Group # groups ICC
pid 13 0.61
paste("Partition Variance")
## [1] "Partition Variance"
anova(mm1)
## Type III Analysis of Variance Table with Satterthwaite's method
##         Sum Sq Mean Sq NumDF DenDF F value  Pr(>F)  
## DATASET 403.75  403.75     1    11  5.1577 0.04423 *
## TASK     67.75   67.75     1    11  0.8655 0.37218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(mm1)
## Computing profile confidence intervals ...
##                   2.5 %    97.5 %
## .sig01         5.404680 17.974025
## .sigma         5.787943 12.644890
## (Intercept)   25.894875 42.324905
## DATASETspace -14.673480 -1.136044
## TASKixn      -10.006813  3.530623
report(mm1) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## + TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.64) and the
## part related to the fixed effects alone (marginal R2) is of 0.09. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 34.11
## (95% CI [25.32, 42.90], t(21) = 8.07, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -7.90, 95% CI [-15.14, -0.67], t(21) = -2.27, p = 0.034; Std. beta = -0.55,
## 95% CI [-1.05, -0.05])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -3.24, 95% CI [-10.48, 4.00], t(21) = -0.93, p = 0.363; Std. beta = -0.22,
## 95% CI [-0.73, 0.28])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(mm1,  show.intercept = TRUE)

check_model(mm1)

# Linear mixed model: number of utterances predicted by DATASET, TASK and their
# interaction, with a random intercept per participant (pid).
print("LMER, UTTERANCES ~ DATASET X TASK")
## [1] "LMER, UTTERANCES ~ DATASET X TASK"
mm2 <- lmer(n_utterances ~ DATASET * TASK + (1|pid), data = df)
paste("Model")
## [1] "Model"
summ(mm2)
Observations 26
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 193.70
BIC 201.25
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.66
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 36.57 5.41 6.76 15.92 0.00
DATASETspace -13.24 7.96 -1.66 15.92 0.12
TASKixn -8.57 7.96 -1.08 15.92 0.30
DATASETspace:TASKixn 10.67 14.32 0.74 11.00 0.47
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 11.25
Residual 8.85
Grouping Variables
Group # groups ICC
pid 13 0.62
paste("Partition Variance")
## [1] "Partition Variance"
anova(mm2)
## Type III Analysis of Variance Table with Satterthwaite's method
##              Sum Sq Mean Sq NumDF DenDF F value  Pr(>F)  
## DATASET      403.75  403.75     1    11  5.1577 0.04423 *
## TASK          67.75   67.75     1    11  0.8655 0.37218  
## DATASET:TASK  43.44   43.44     1    11  0.5549 0.47195  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(mm2)
## Computing profile confidence intervals ...
##                           2.5 %    97.5 %
## .sig01                 4.993856 17.494032
## .sigma                 5.787971 12.644901
## (Intercept)           26.281282 46.861575
## DATASETspace         -28.384780  1.908591
## TASKixn              -23.718114  6.575257
## DATASETspace:TASKixn -17.180459 38.513793
report(mm2) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## * TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.66) and the
## part related to the fixed effects alone (marginal R2) is of 0.12. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 36.57
## (95% CI [25.29, 47.85], t(20) = 6.76, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -13.24, 95% CI [-29.84, 3.37], t(20) = -1.66, p = 0.112; Std. beta =
## -0.92, 95% CI [-2.07, 0.23])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -8.57, 95% CI [-25.18, 8.04], t(20) = -1.08, p = 0.294; Std. beta = -0.59,
## 95% CI [-1.75, 0.56])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 10.67, 95% CI [-19.20, 40.54], t(20) = 0.74, p = 0.465;
## Std. beta = 0.74, 95% CI [-1.33, 2.81])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(mm2,  show.intercept = TRUE)

check_model(mm2)

9.1.2 POISSON Mixed Effects Models

# 
# #NUMBER UTTERANCES predicted by TASK + DATASET  | participatnt--> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET + TASK")
# pmm1 <- glmer(n_utterances ~ TASK + DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm1)
# paste("Partition Variance")
# anova(pmm1)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm1)
# report(pmm1) #sanity check
# plot_model(pmm1,  show.intercept = TRUE)
# check_model(pmm1)
# 
# #NUMBER UTTERANCES predicted by TASK X DATASET  | participatnt--> POISSON MIXED LINEAR REGRESSION
# print("POISSON-MER, UTTERANCES ~ DATASET X TASK")
# pmm2 <- glmer(n_utterances ~ TASK * DATASET + (1|pid), data = df, family = "poisson")
# paste("Model")
# summ(pmm2)
# paste("Partition Variance")
# anova(pmm2)
# paste("Confidence Interval on Parameter Estimates")
# confint(pmm2)
# report(pmm2) #sanity check
# plot_model(pmm2,  show.intercept = TRUE)
# check_model(pmm2)